Sys.setenv(XML_CONFIG="/usr/bin/xml2-config")
# Package names
packages_1 <- c("dplyr", "tidyverse", "pheatmap", "stats", "gplots", "dendextend", "gtsummary", "colorRamp2", "survival", "survminer", "readxl", "RColorBrewer", "rpart", "forcats", "factoextra", "glmnet", "caret", "randomForest", "webshot", "htmlwidgets", "webshot", "pROC")
packages_2 <- c("GO.db", "HDO.db", "ComplexHeatmap", "ConsensusClusterPlus", "clusterProfiler", "edgeR", "biomaRt", "XML", "limma", "org.Hs.eg.db")
# Install packages not yet installed
installed_packages <- packages_1 %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages_1[!installed_packages])
}
# Install BiocManager not yet installed
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
# Install packages not yet installed
installed_packages <- packages_2 %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
BiocManager::install(packages_2[!installed_packages])
}
# Packages loading
invisible(lapply(packages_1, library, character.only = TRUE))
invisible(lapply(packages_2, library, character.only = TRUE))
samples <- read.delim("~/TCGA_SKCM/skcm_tcga/data_clinical_sample.txt", skip = 4)
df_s <- unique(samples)
# Filtering and transforming data
df_s <- df_s %>%
distinct(PATIENT_ID, .keep_all = TRUE) %>%
dplyr::select(PATIENT_ID, SAMPLE_ID, DAYS_TO_COLLECTION, SAMPLE_TYPE, TMB_NONSYNONYMOUS) %>%
mutate(METASTASIS = case_when(
SAMPLE_TYPE == "Primary" ~ 0,
SAMPLE_TYPE == "Metastasis" ~ 1
)) %>%
mutate(log10TMB = log10(TMB_NONSYNONYMOUS)) %>%
dplyr::select(-SAMPLE_TYPE, -TMB_NONSYNONYMOUS) %>%
mutate_all(~gsub("\\[Not Available\\]", NA, .)) %>%
mutate(DAYS_TO_COLLECTION = as.numeric(DAYS_TO_COLLECTION),
METASTASIS = as.numeric(METASTASIS),
log10TMB = as.numeric(log10TMB)
)
head(df_s)
## PATIENT_ID SAMPLE_ID DAYS_TO_COLLECTION METASTASIS log10TMB
## 1 TCGA-BF-A1PU TCGA-BF-A1PU-01 338 0 0.3862016
## 2 TCGA-BF-A1PV TCGA-BF-A1PV-01 106 0 0.9311187
## 3 TCGA-BF-A1PX TCGA-BF-A1PX-01 122 0 1.0114295
## 4 TCGA-BF-A1PZ TCGA-BF-A1PZ-01 97 0 0.8711836
## 5 TCGA-BF-A1Q0 TCGA-BF-A1Q0-01 75 0 1.2855573
## 6 TCGA-D3-A1Q1 TCGA-D3-A1Q1-06 2305 1 0.2304489
rm(samples)
patients <- read.delim("~/TCGA_SKCM/skcm_tcga/data_clinical_patient.txt", skip=4)
df_p <- unique(patients)
# Filtering and transforming data
df_p <- df_p %>%
distinct(PATIENT_ID, .keep_all = TRUE) %>%
dplyr::select(PATIENT_ID, SEX, HEIGHT, WEIGHT, RACE, TUMOR_STATUS, CLARK_LEVEL_AT_DIAGNOSIS, OS_STATUS, OS_MONTHS, DFS_STATUS, DFS_MONTHS, AGE) %>%
mutate(MALE = case_when(
SEX == "Male" ~ 1,
SEX == "Female" ~ 0
)) %>%
mutate(CLARK_LEVEL = case_when(
CLARK_LEVEL_AT_DIAGNOSIS == "I" ~ 1,
CLARK_LEVEL_AT_DIAGNOSIS == "II" ~ 2,
CLARK_LEVEL_AT_DIAGNOSIS == "III" ~ 3,
CLARK_LEVEL_AT_DIAGNOSIS == "IV" ~ 4,
)) %>%
mutate(DECEASED = case_when(
OS_STATUS == "1:DECEASED" ~ 1,
OS_STATUS == "0:LIVING" ~ 0
)) %>%
mutate(RECURRED_PROGRESSED = case_when(
DFS_STATUS == "1:Recurred/Progressed" ~ 1,
DFS_STATUS == "0:DiseaseFree" ~ 0
))%>%
mutate(TUMOR_FREE = case_when(
TUMOR_STATUS == "TUMOR FREE" ~ 1,
TUMOR_STATUS == "WITH TUMOR" ~ 0
))%>%
dplyr::select(-SEX, -CLARK_LEVEL_AT_DIAGNOSIS,-OS_STATUS, -DFS_STATUS, -TUMOR_STATUS) %>%
mutate_all(~gsub("\\[Not Available\\]", NA, .)) %>%
mutate(HEIGHT = as.numeric(HEIGHT),
WEIGHT = as.numeric(WEIGHT),
OS_MONTHS = as.numeric(OS_MONTHS),
AGE = as.numeric(AGE),
DFS_MONTHS = as.numeric(DFS_MONTHS),
MALE = as.numeric(MALE),
CLARK_LEVEL = as.numeric(CLARK_LEVEL),
DECEASED = as.numeric(DECEASED),
RECURRED_PROGRESSED = as.numeric(RECURRED_PROGRESSED),
TUMOR_FREE = as.numeric(TUMOR_FREE)
)
head(df_p)
## PATIENT_ID HEIGHT WEIGHT RACE OS_MONTHS DFS_MONTHS AGE MALE CLARK_LEVEL
## 1 TCGA-3N-A9WB 175 78 WHITE 17.02 16.00 71 1 3
## 2 TCGA-3N-A9WC 183 68 WHITE 66.43 56.01 82 1 4
## 3 TCGA-3N-A9WD 183 116 WHITE 12.98 10.05 82 1 3
## 4 TCGA-BF-A1PU 160 58 WHITE 12.71 15.90 46 0 3
## 5 TCGA-BF-A1PV 160 70 WHITE 0.46 0.46 74 0 4
## 6 TCGA-BF-A1PX 175 78 WHITE 9.26 NA 56 1 3
## DECEASED RECURRED_PROGRESSED TUMOR_FREE
## 1 1 1 0
## 2 0 1 0
## 3 1 1 0
## 4 0 1 1
## 5 0 0 1
## 6 1 NA 1
rm(patients)
original_data <- read_excel("~/TCGA_SKCM/original_data/mmc2.xlsx", sheet = 4, skip = 1)
# Filtering and transforming data
original_data <- original_data %>%
dplyr::rename('SAMPLE_ID' = 'Name') %>%
dplyr::select(SAMPLE_ID, MUTATIONSUBTYPES, REGIONAL_VS_PRIMARY, `RNASEQ-CLUSTER_CONSENHIER`, MethTypes.201408, PIGMENT.SCORE, NECROSIS, LYMPHOCYTE.SCORE, `UV-RATE`) %>%
dplyr::rename('RNASEQ_CLUSTER_CONSENHIER' = 'RNASEQ-CLUSTER_CONSENHIER') %>%
mutate(TISSUE_ORIGIN = case_when(
REGIONAL_VS_PRIMARY == "Primary_Disease" ~ 'Primary',
REGIONAL_VS_PRIMARY == "Regional_Skin_or_Soft_Tissue" ~ 'Regional_non_LN',
REGIONAL_VS_PRIMARY == "Distant_Metastases" ~ 'Distant_Metastases',
REGIONAL_VS_PRIMARY == "Regional_Lymph_Node" ~ 'Regional_LN'
)) %>%
dplyr::rename('UV_RATE' = 'UV-RATE') %>%
dplyr::select(-REGIONAL_VS_PRIMARY) %>%
mutate_all(~gsub("\\[Not Available\\]", NA, .)) %>%
mutate(
PIGMENT.SCORE = as.numeric(PIGMENT.SCORE),
NECROSIS = as.numeric(NECROSIS),
LYMPHOCYTE.SCORE = as.numeric(LYMPHOCYTE.SCORE),
UV_RATE = as.numeric(UV_RATE)
) %>%
dplyr::filter(RNASEQ_CLUSTER_CONSENHIER != '-')
subset_indices1 <- original_data$RNASEQ_CLUSTER_CONSENHIER == "-"
original_data$RNASEQ_CLUSTER_CONSENHIER[subset_indices1] <- NA
subset_indices2 <- original_data$MUTATIONSUBTYPES == "-"
original_data$MUTATIONSUBTYPES[subset_indices2] <- NA
head(original_data)
## # A tibble: 6 × 9
## SAMPLE_ID MUTATIONSUBTYPES RNASEQ_CLUSTER_CONSENH…¹ MethTypes.201408
## <chr> <chr> <chr> <chr>
## 1 TCGA-BF-A1PU-01 BRAF_Hotspot_Mutants keratin normal-like
## 2 TCGA-BF-A1PV-01 RAS_Hotspot_Mutants keratin CpG island-meth…
## 3 TCGA-BF-A1PX-01 BRAF_Hotspot_Mutants keratin normal-like
## 4 TCGA-BF-A1PZ-01 RAS_Hotspot_Mutants keratin hypo-methylated
## 5 TCGA-BF-A1Q0-01 Triple_WT immune CpG island-meth…
## 6 TCGA-BF-A3DJ-01 BRAF_Hotspot_Mutants keratin hypo-methylated
## # ℹ abbreviated name: ¹RNASEQ_CLUSTER_CONSENHIER
## # ℹ 5 more variables: PIGMENT.SCORE <dbl>, NECROSIS <dbl>,
## # LYMPHOCYTE.SCORE <dbl>, UV_RATE <dbl>, TISSUE_ORIGIN <chr>
# Merging df_s with df_p on 'PATIENT_ID' to combine related data from both dataframes
df_ps <-
left_join(df_s, df_p, by = "PATIENT_ID") %>%
# Filtering out specific samples based on their 'SAMPLE_ID' because there are not into mat
filter(SAMPLE_ID != "TCGA-GN-A269-01") %>%
filter(SAMPLE_ID != "TCGA-GN-A261-06")
# Merging the previously obtained dataframe (df_ps) with another dataframe (original_data) on 'SAMPLE_ID'
df_meta <-
left_join(original_data, df_ps, by = "SAMPLE_ID")
# Modify LYMPHOCYTE SCORE according to the article
df_meta <- df_meta %>%
mutate(IMMUNE.SCORE = case_when(
is.na(LYMPHOCYTE.SCORE) ~ as.character(NA),
LYMPHOCYTE.SCORE %in% c("1", "2") ~ "1-2",
LYMPHOCYTE.SCORE %in% c("3", "4") ~ "3-4",
TRUE ~ "5-6"))
df_meta %>%
ggplot(mapping = aes(x = AGE)) +
geom_histogram()
summary(df_meta$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.0 46.0 57.0 57.2 70.0 90.0 6
df_meta %>%
ggplot(mapping = aes(WEIGHT)) +
geom_histogram()
summary(df_meta$WEIGHT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 40.00 70.00 81.00 82.67 90.00 161.00 185
df_meta %>%
ggplot(mapping = aes(x = HEIGHT)) +
geom_histogram()
summary(df_meta$HEIGHT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 146 164 170 170 178 193 192
df_meta %>%
ggplot(mapping = aes(x = log10TMB)) +
geom_histogram()
summary(df_meta$log10TMB)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.6320 0.6484 0.9971 0.9472 1.2518 3.0254 9
# Read and preprocess the data
df_t <-
read_delim("~/TCGA_SKCM/skcm_tcga/data_mrna_seq_v2_rsem.txt") %>%
dplyr::select(-Hugo_Symbol) %>%
dplyr::select(Entrez_Gene_Id, df_meta$SAMPLE_ID) %>%
distinct(Entrez_Gene_Id, .keep_all = TRUE)
# Transpose and set row and column names
mat <- t(df_t[-1])
colnames(mat) <- df_t$Entrez_Gene_Id
rownames(mat) <- colnames(df_t)[-1]
# Log Transformation
mat <- log2(mat + 1)
# Feature selection based on variability
features <-
apply(mat, 2, mad) %>%
sort(decreasing = T) %>%
.[1:1500] %>%
names()
# Normalize the data
mat <- apply(mat, 2, function(x) (x - median(x))/mad(x))
# Reduce the matrix to selected features
mat <- mat[, features]
# Perform hierarchical clustering on samples and cut the dendrogram to form 'k.smpl' clusters
smpl.grp <- cutree(smpl.dend, k = k.smpl)
# Visualize the sample clusters
fviz_cluster(list(data = mat, cluster = smpl.grp),
palette = "Set1",
geom = "point",
ellipse.type = "convex",
show.clust.cent = FALSE,
main = "Sample clusters",
ggtheme = theme_minimal())
# Perform hierarchical clustering on genes and cut the dendrogram to form 'k.gene' clusters
gene.grp <- cutree(gene.dend, k = k.gene)
# Visualize the gene clusters
fviz_cluster(list(data = t(mat), cluster = gene.grp),
palette = "Set1",
geom = "point",
ellipse.type = "convex",
show.clust.cent = FALSE,
main = "Gene clusters",
ggtheme = theme_minimal())
# Convert the matrix 'mat' to a data frame
mat_df <- as.data.frame(mat)
# Append cluster labels as a new column
mat_df$labels <- as.factor(smpl.grp)
# Ensure that labels are factors with appropriate levels
mat_df$labels <- factor(mat_df$labels, levels = c("1", "2", "3"))
# Select genes
genes <- colnames(mat_df)
# Perform Kruskal-Wallis test for each gene
res <- sapply(genes, function(x) {
model <- kruskal.test(mat_df[[x]] ~ mat_df$labels)
p <- model$p.value
return(p)
})
# Set names for the result vector
names(res) <- genes
# Adjust p-values for multiple testing using Benjamini-Hochberg procedure
adjusted_p <- p.adjust(res, method = "BH")
# Define a significance threshold
threshold <- 0.001
# Identify significant genes for each label
significant_genes <- list()
for (label in levels(mat_df$labels)) {
sig_genes <- names(adjusted_p)[adjusted_p < threshold & mat_df$labels == label]
significant_genes[[paste("Label", label, sep = "_")]] <- sig_genes
}
# Querying ensembl for gene information
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Retrieving gene symbols
symbols_1 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[1]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
# Preparing to query ensembl
values_to_query <- symbols_1$ENTREZID
# Expand attributes to include functional information
genes_info_1 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query,
mart = ensembl)
# Check the first few rows to confirm it worked as expected
head(genes_info_1)
## gene_biotype entrezgene_id external_gene_name
## 1 protein_coding 1001 CDH3
## 2 lncRNA 100124700 HOTAIR
## 3 protein_coding 100128553 CTAGE4
## 4 protein_coding 100133941 CD24
## 5 protein_coding 10098 TSPAN5
## 6 protein_coding 10149 ADGRG2
## description
## 1 cadherin 3 [Source:HGNC Symbol;Acc:HGNC:1762]
## 2 HOX transcript antisense RNA [Source:HGNC Symbol;Acc:HGNC:33510]
## 3 CTAGE family member 4 [Source:HGNC Symbol;Acc:HGNC:24772]
## 4 CD24 molecule [Source:HGNC Symbol;Acc:HGNC:1645]
## 5 tetraspanin 5 [Source:HGNC Symbol;Acc:HGNC:17753]
## 6 adhesion G protein-coupled receptor G2 [Source:HGNC Symbol;Acc:HGNC:4516]
# Performing GO enrichment analysis
ego1 <- enrichGO(gene = symbols_1$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.01)
# Open a PNG graphics device, specifying the file name and dimensions
png("~/TCGA_SKCM/dot_plot_1.png", width = 1200, height = 800)
# Generate the dot plot
clusterProfiler::dotplot(ego1, showCategory = 20)
# Close the device to save the plot to the file
dev.off()
## quartz_off_screen
## 2
symbols_2 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[2]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query_2 <- symbols_2$ENTREZID
genes_info_2 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query_2,
mart = ensembl)
head(genes_info_2)
## gene_biotype entrezgene_id external_gene_name
## 1 protein_coding 1000 CDH2
## 2 protein_coding 100131187 TSTD1
## 3 lncRNA 100233209 PCED1B-AS1
## 4 protein_coding 10085 EDIL3
## 5 protein_coding 10125 RASGRP1
## 6 protein_coding 10178 TENM1
## description
## 1 cadherin 2 [Source:HGNC Symbol;Acc:HGNC:1759]
## 2 thiosulfate sulfurtransferase like domain containing 1 [Source:HGNC Symbol;Acc:HGNC:35410]
## 3 PCED1B antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:44166]
## 4 EGF like repeats and discoidin domains 3 [Source:HGNC Symbol;Acc:HGNC:3173]
## 5 RAS guanyl releasing protein 1 [Source:HGNC Symbol;Acc:HGNC:9878]
## 6 teneurin transmembrane protein 1 [Source:HGNC Symbol;Acc:HGNC:8117]
ego2 <- enrichGO(gene = symbols_2$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.005)
png("~/TCGA_SKCM/dot_plot_2.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego2, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
symbols_3 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[3]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query_3 <- symbols_3$ENTREZID
genes_info_3 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query_3,
mart = ensembl)
head(genes_info_3)
## gene_biotype entrezgene_id external_gene_name
## 1 lncRNA 100190938 RAMP2-AS1
## 2 protein_coding 10082 GPC6
## 3 protein_coding 10170 DHRS9
## 4 protein_coding 10225 CD96
## 5 protein_coding 10351 ABCA8
## 6 protein_coding 10481 HOXB13
## description
## 1 RAMP2 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:44358]
## 2 glypican 6 [Source:HGNC Symbol;Acc:HGNC:4454]
## 3 dehydrogenase/reductase 9 [Source:HGNC Symbol;Acc:HGNC:16888]
## 4 CD96 molecule [Source:HGNC Symbol;Acc:HGNC:16892]
## 5 ATP binding cassette subfamily A member 8 [Source:HGNC Symbol;Acc:HGNC:38]
## 6 homeobox B13 [Source:HGNC Symbol;Acc:HGNC:5112]
ego3 <- enrichGO(gene = symbols_3$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05)
png("~/TCGA_SKCM/dot_plot_3.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego3, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
# Append cluster labels as a new column
mat_df$labels <- as.factor(smpl.grp)
# Prepare labels for replacement
label_replacements <- c("1" = "keratin", "2" = "immune", "3" = "MIFT-low")
# Replace numeric labels with descriptive labels
mat_df$labels <- factor(mat_df$labels, levels = names(label_replacements), labels = label_replacements)
# Merge two dataframes
merged_df <- merge(x = mat_df, y = df_meta, by.x = "row.names", by.y = "SAMPLE_ID", all.x = TRUE)
# Rename the first column to "row.names"
colnames(merged_df)[1] <- "SAMPLE_ID"
# See how many clusters are the same
matching_counts <- table(merged_df$RNASEQ_CLUSTER_CONSENHIER == merged_df$labels)
matching_counts
##
## FALSE TRUE
## 129 200
# Assign biological interpretations to clusters
smpl.grp <- cutree(smpl.dend, k = k.smpl)
smpl.grp <- case_when(smpl.grp == 1 ~ "keratin",
smpl.grp == 2 ~ "immune",
smpl.grp == 3 ~ "MFTI-low")
# Prepare the data frame for analysis
df <- df_meta %>%
mutate(CLUSTER = as.factor(smpl.grp)) %>%
distinct(PATIENT_ID, .keep_all = T) %>%
filter(!is.na(DECEASED))
surv_obj <- Surv(time = df$OS_MONTHS/12, event = df$DECEASED)
# Fit Kaplan-Meier survival curve
km_fit <- survfit(surv_obj ~ df$CLUSTER)
ggsurvplot(km_fit,
conf.int = F, # Show confidence interval
risk.table = F, # Add a table of the number of subjects at risk at each time point
xlab = "Time (years)", # Customize X-axis label
ylab = "Cumulative Survival Rate", # Customize Y-axis label
ggtheme = theme_minimal(),
data = df) # Use a minimal theme for the plot
# Merging df_s with df_p on 'PATIENT_ID' to combine related data from both dataframes
df_meta_2 <-
left_join(df_s, df_p, by = "PATIENT_ID") %>%
# Filtering out specific samples based on their 'SAMPLE_ID' because there are not into mat
dplyr::filter(SAMPLE_ID != "TCGA-GN-A269-01") %>%
dplyr::filter(SAMPLE_ID != "TCGA-GN-A261-06")
# Read and preprocess the data
df_t <-
read_delim("~/TCGA_SKCM/skcm_tcga/data_mrna_seq_v2_rsem.txt") %>%
dplyr::select(-Hugo_Symbol) %>%
dplyr::select(Entrez_Gene_Id, df_meta_2$SAMPLE_ID) %>%
distinct(Entrez_Gene_Id, .keep_all = TRUE)
# Transpose and set row and column names
mat <- t(df_t[-1])
colnames(mat) <- df_t$Entrez_Gene_Id
rownames(mat) <- colnames(df_t)[-1]
# Log Transformation
mat <- log2(mat + 1)
# Feature selection based on variability
features <-
apply(mat, 2, mad) %>%
sort(decreasing = T) %>%
.[1:1500] %>%
names()
# Normalize the data
mat <- apply(mat, 2, function(x) (x - median(x))/mad(x))
# Reduce the matrix to selected features
mat <- mat[, features]
# Perform hierarchical clustering on samples and cut the dendrogram to form 'k.smpl' clusters
smpl.grp <- cutree(smpl.dend, k = k.smpl)
# Visualize the sample clusters
fviz_cluster(list(data = mat, cluster = smpl.grp),
palette = "Set1",
geom = "point",
ellipse.type = "convex",
show.clust.cent = FALSE,
main = "Sample clusters",
ggtheme = theme_minimal())
# Perform hierarchical clustering on genes and cut the dendrogram to form 'k.gene' clusters
gene.grp <- cutree(gene.dend, k = k.gene)
# Visualize the gene clusters
fviz_cluster(list(data = t(mat), cluster = gene.grp),
palette = "Set1",
geom = "point",
ellipse.type = "convex",
show.clust.cent = FALSE,
main = "Gene clusters",
ggtheme = theme_minimal())
#### Kruskal-Wallis Test is used for comparing two or more independent
samples of equal or different sample sizes
# Convert the matrix 'mat' to a data frame
mat_df <- as.data.frame(mat)
# Append cluster labels as a new column
mat_df$labels <- as.factor(smpl.grp)
# Ensure that labels are factors with appropriate levels
mat_df$labels <- factor(mat_df$labels, levels = c("1", "2", "3"))
# Select genes
genes <- colnames(mat_df)
# Perform Kruskal-Wallis test for each gene
res <- sapply(genes, function(x) {
model <- kruskal.test(mat_df[[x]] ~ mat_df$labels)
p <- model$p.value
return(p)
})
# Set names for the result vector
names(res) <- genes
# Adjust p-values for multiple testing using Benjamini-Hochberg procedure
adjusted_p <- p.adjust(res, method = "BH")
# Define a significance threshold
threshold <- 0.001
# Identify significant genes for each label
significant_genes <- list()
for (label in levels(mat_df$labels)) {
sig_genes <- names(adjusted_p)[adjusted_p < threshold & mat_df$labels == label]
significant_genes[[paste("Label", label, sep = "_")]] <- sig_genes
}
# Querying ensembl for gene information
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Retrieving gene symbols
symbols_1 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[1]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
# Preparing to query ensembl
values_to_query <- symbols_1$ENTREZID
# Expand attributes to include functional information
genes_info_1 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query,
mart = ensembl)
# Check the first few rows to confirm it worked as expected
head(genes_info_1)
## gene_biotype entrezgene_id external_gene_name
## 1 protein_coding 1001 CDH3
## 2 lncRNA 100124700 HOTAIR
## 3 protein_coding 100133941 CD24
## 4 protein_coding 10082 GPC6
## 5 protein_coding 10225 CD96
## 6 protein_coding 10361 NPM2
## description
## 1 cadherin 3 [Source:HGNC Symbol;Acc:HGNC:1762]
## 2 HOX transcript antisense RNA [Source:HGNC Symbol;Acc:HGNC:33510]
## 3 CD24 molecule [Source:HGNC Symbol;Acc:HGNC:1645]
## 4 glypican 6 [Source:HGNC Symbol;Acc:HGNC:4454]
## 5 CD96 molecule [Source:HGNC Symbol;Acc:HGNC:16892]
## 6 nucleophosmin/nucleoplasmin 2 [Source:HGNC Symbol;Acc:HGNC:7930]
# Performing GO enrichment analysis
ego1 <- enrichGO(gene = symbols_1$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.005)
# Open a PNG graphics device, specifying the file name and dimensions
png("~/TCGA_SKCM/dot_plot_4.png", width = 1200, height = 800)
# Generate the dot plot
clusterProfiler::dotplot(ego1, showCategory = 20)
# Close the device to save the plot to the file
dev.off()
## quartz_off_screen
## 2
symbols_2 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[2]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query_2 <- symbols_2$ENTREZID
genes_info_2 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query_2,
mart = ensembl)
head(genes_info_2)
## gene_biotype entrezgene_id external_gene_name
## 1 protein_coding 100131187 TSTD1
## 2 protein_coding 10098 TSPAN5
## 3 protein_coding 10148 EBI3
## 4 protein_coding 10351 ABCA8
## 5 protein_coding 10391 CORO2B
## 6 protein_coding 10900 RUNDC3A
## description
## 1 thiosulfate sulfurtransferase like domain containing 1 [Source:HGNC Symbol;Acc:HGNC:35410]
## 2 tetraspanin 5 [Source:HGNC Symbol;Acc:HGNC:17753]
## 3 Epstein-Barr virus induced 3 [Source:HGNC Symbol;Acc:HGNC:3129]
## 4 ATP binding cassette subfamily A member 8 [Source:HGNC Symbol;Acc:HGNC:38]
## 5 coronin 2B [Source:HGNC Symbol;Acc:HGNC:2256]
## 6 RUN domain containing 3A [Source:HGNC Symbol;Acc:HGNC:16984]
ego2 <- enrichGO(gene = symbols_2$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05)
png("~/TCGA_SKCM/dot_plot_5.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego2, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
symbols_3 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[3]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query_3 <- symbols_3$ENTREZID
genes_info_3 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query_3,
mart = ensembl)
head(genes_info_3)
## gene_biotype entrezgene_id external_gene_name
## 1 protein_coding 10004 NAALADL1
## 2 lncRNA 100190938 RAMP2-AS1
## 3 lncRNA 100233209 PCED1B-AS1
## 4 protein_coding 10085 EDIL3
## 5 protein_coding 10178 TENM1
## 6 protein_coding 10278 EFS
## description
## 1 N-acetylated alpha-linked acidic dipeptidase like 1 [Source:HGNC Symbol;Acc:HGNC:23536]
## 2 RAMP2 antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:44358]
## 3 PCED1B antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:44166]
## 4 EGF like repeats and discoidin domains 3 [Source:HGNC Symbol;Acc:HGNC:3173]
## 5 teneurin transmembrane protein 1 [Source:HGNC Symbol;Acc:HGNC:8117]
## 6 embryonal Fyn-associated substrate [Source:HGNC Symbol;Acc:HGNC:16898]
ego3 <- enrichGO(gene = symbols_3$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05)
png("~/TCGA_SKCM/dot_plot_6.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego3, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
# Append cluster labels as a new column
mat_df$labels <- as.factor(smpl.grp)
# Prepare labels for replacement
label_replacements <- c("1" = "keratin", "2" = "MIFT-low", "3" = "immune")
# Replace numeric labels with descriptive labels
mat_df$labels <- factor(mat_df$labels, levels = names(label_replacements), labels = label_replacements)
# Merge two dataframes
merged_df <- merge(x = mat_df, y = df_meta, by.x = "row.names", by.y = "SAMPLE_ID", all.x = TRUE)
# Rename the first column to "row.names"
colnames(merged_df)[1] <- "SAMPLE_ID"
# See how many clusters are the same
matching_counts <- table(merged_df$RNASEQ_CLUSTER_CONSENHIER == merged_df$labels)
matching_counts
##
## FALSE TRUE
## 127 200
# Assign biological interpretations to clusters
smpl.grp <- cutree(smpl.dend, k = k.smpl)
smpl.grp <- case_when(smpl.grp == 1 ~ "keratin",
smpl.grp == 2 ~ "MIFT-low",
smpl.grp == 3 ~ "immune")
# Prepare the data frame for analysis
df <- df_meta_2 %>%
mutate(CLUSTER = as.factor(smpl.grp)) %>%
distinct(PATIENT_ID, .keep_all = T) %>%
filter(!is.na(DECEASED))
surv_obj <- Surv(time = df$OS_MONTHS/12, event = df$DECEASED)
# Fit Kaplan-Meier survival curve
km_fit <- survfit(surv_obj ~ df$CLUSTER)
ggsurvplot(km_fit,
conf.int = F, # Show confidence interval
risk.table = F, # Add a table of the number of subjects at risk at each time point
xlab = "Time (years)", # Customize X-axis label
ylab = "Cumulative Survival Rate", # Customize Y-axis label
ggtheme = theme_minimal(),
data = df) # Use a minimal theme for the plot
class <- df_meta$RNASEQ_CLUSTER_CONSENHIER
# Read and preprocess the data
df_t <-
read_delim("~/TCGA_SKCM/skcm_tcga/data_mrna_seq_v2_rsem.txt") %>%
dplyr::select(-Hugo_Symbol) %>%
dplyr::select(Entrez_Gene_Id, df_meta$SAMPLE_ID) %>%
distinct(Entrez_Gene_Id, .keep_all = TRUE)
# Transpose and set row and column names
mat <- t(df_t[-1])
colnames(mat) <- df_t$Entrez_Gene_Id
rownames(mat) <- colnames(df_t)[-1]
# Log Transformation
mat <- log2(mat + 1)
# Feature selection based on variability
features <-
apply(mat, 2, mad) %>%
sort(decreasing = T) %>%
.[1:1500] %>%
names()
# Normalize the data
mat <- apply(mat, 2, function(x) (x - median(x))/mad(x))
# Reduce the matrix to selected features
mat <- mat[, features]
# Read and preprocess the data
df_t_2 <-
read_delim("~/TCGA_SKCM/skcm_tcga/data_mrna_seq_v2_rsem.txt") %>%
dplyr::select(-Hugo_Symbol) %>%
dplyr::select(Entrez_Gene_Id, df_meta_2$SAMPLE_ID) %>%
distinct(Entrez_Gene_Id, .keep_all = TRUE)
# Transpose and set row and column names
mat_2 <- t(df_t_2[-1])
colnames(mat_2) <- df_t_2$Entrez_Gene_Id
rownames(mat_2) <- colnames(df_t_2)[-1]
# Log Transformation
mat_2 <- log2(mat_2 + 1)
# Feature selection based on variability
features <-
apply(mat_2, 2, mad) %>%
sort(decreasing = T) %>%
.[1:1500] %>%
names()
# Normalize the data
mat_2 <- apply(mat_2, 2, function(x) (x - median(x))/mad(x))
# Reduce the matrix to selected features
mat_2 <- mat_2[, features]
# Ensure reproducibility
set.seed(123)
# Performe cross-validated logistic regression for multinomial outcomes
cv_fit <- cv.glmnet(as.matrix(mat), as.factor(class), family = "multinomial", alpha = 1)
plot(cv_fit)
best_lambda <- cv_fit$lambda.min
lasso_model <- glmnet(as.matrix(mat), as.factor(class), family = "multinomial", alpha = 1, lambda = best_lambda)
# Extract non-zero coefficients (features) from LASSO Model
coef_matrix <- coef(lasso_model, s = best_lambda)
non_zero_features <- unique(unlist(lapply(coef_matrix[-1], function(x) {
# Extract feature names for non-zero coefficients, excluding intercept
features <- rownames(x)[x@i[x@i != 0]]
if (length(features) > 0) return(features) else return(NULL)
})))
# Subset to include only selected features
mat_final <- mat[, non_zero_features]
common_features <- non_zero_features[non_zero_features %in% colnames(mat_2)]
# Subset to include only the common features
mat_2_final <- mat_2[, common_features]
mat_final <- mat[, common_features]
rna_data <- data.frame(mat_final)
dim(mat_final)
## [1] 329 49
dim(mat_2_final)
## [1] 469 49
# Split the data into training and testing sets
set.seed(123)
train_indices <- sample(nrow(rna_data), 0.7 * nrow(rna_data))
train_data <- rna_data[train_indices, ]
test_data <- rna_data[-train_indices, ]
train_labels <- class[train_indices]
test_labels <- class[-train_indices]
# Train the model using labels from df_meta
multinom_model <- cv.glmnet(as.matrix(train_data), as.factor(train_labels), family = "multinomial", alpha = 1)
# Predict on the test dataset and calculate accuracy
predictions <- predict(multinom_model, newx = as.matrix(test_data), type = "class")
confusionMatrix(factor(predictions), factor(test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction immune keratin MITF-low
## immune 47 4 1
## keratin 3 30 2
## MITF-low 1 0 11
##
## Overall Statistics
##
## Accuracy : 0.8889
## 95% CI : (0.8099, 0.9432)
## No Information Rate : 0.5152
## P-Value [Acc > NIR] : 2.252e-15
##
## Kappa : 0.8119
##
## Mcnemar's Test P-Value : 0.5433
##
## Statistics by Class:
##
## Class: immune Class: keratin Class: MITF-low
## Sensitivity 0.9216 0.8824 0.7857
## Specificity 0.8958 0.9231 0.9882
## Pos Pred Value 0.9038 0.8571 0.9167
## Neg Pred Value 0.9149 0.9375 0.9655
## Prevalence 0.5152 0.3434 0.1414
## Detection Rate 0.4747 0.3030 0.1111
## Detection Prevalence 0.5253 0.3535 0.1212
## Balanced Accuracy 0.9087 0.9027 0.8870
# Perform prediction on the new data matrix
predictions_new <- predict(multinom_model, newx = as.matrix(mat_2_final), type = "class")
# Align predictions with the sample identifiers
predictions_new_df <- data.frame(prediction = as.character(predictions_new), row.names = rownames(mat_2_final))
predictions_new_df <- tibble::rownames_to_column(predictions_new_df, var = "SAMPLE_ID")
# Print or inspect the aligned predictions
head(predictions_new_df)
## SAMPLE_ID prediction
## 1 TCGA-BF-A1PU-01 keratin
## 2 TCGA-BF-A1PV-01 keratin
## 3 TCGA-BF-A1PX-01 immune
## 4 TCGA-BF-A1PZ-01 keratin
## 5 TCGA-BF-A1Q0-01 immune
## 6 TCGA-D3-A1Q1-06 keratin
# Merge predicted classes with original metadata
predictions_with_class <- predictions_new_df %>%
left_join(df_meta %>% dplyr::select(SAMPLE_ID, RNASEQ_CLUSTER_CONSENHIER, PATIENT_ID, OS_MONTHS, DECEASED), by = "SAMPLE_ID")
# Calculate matching counts
matching_counts <- table(predictions_with_class$RNASEQ_CLUSTER_CONSENHIER == predictions_with_class$prediction)
matching_counts
##
## FALSE TRUE
## 21 306
# Prepare the data frame for analysis
df <- predictions_with_class %>%
distinct(PATIENT_ID, .keep_all = T) %>%
filter(!is.na(DECEASED)) %>%
mutate(CLUSTER = prediction)
surv_obj <- Surv(time = df$OS_MONTHS/12, event = df$DECEASED)
# Fit Kaplan-Meier survival curve
km_fit <- survfit(surv_obj ~ df$CLUSTER)
ggsurvplot(km_fit,
conf.int = F, # Show confidence interval
risk.table = F, # Add a table of the number of subjects at risk at each time point
xlab = "Time (years)", # Customize X-axis label
ylab = "Cumulative Survival Rate", # Customize Y-axis label
ggtheme = theme_minimal(),
data = df) # Use a minimal theme for the plot
# Train the Random Forest model
rf_model <- randomForest(x = train_data, y = as.factor(train_labels), ntree = 500)
# Predict on the test dataset
predictions_test_rf <- predict(rf_model, newdata = test_data)
# Calculate accuracy or confusion matrix for test predictions
confusionMatrix(predictions_test_rf, factor(test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction immune keratin MITF-low
## immune 49 6 1
## keratin 1 28 1
## MITF-low 1 0 12
##
## Overall Statistics
##
## Accuracy : 0.899
## 95% CI : (0.8221, 0.9505)
## No Information Rate : 0.5152
## P-Value [Acc > NIR] : 2.914e-16
##
## Kappa : 0.8276
##
## Mcnemar's Test P-Value : 0.206
##
## Statistics by Class:
##
## Class: immune Class: keratin Class: MITF-low
## Sensitivity 0.9608 0.8235 0.8571
## Specificity 0.8542 0.9692 0.9882
## Pos Pred Value 0.8750 0.9333 0.9231
## Neg Pred Value 0.9535 0.9130 0.9767
## Prevalence 0.5152 0.3434 0.1414
## Detection Rate 0.4949 0.2828 0.1212
## Detection Prevalence 0.5657 0.3030 0.1313
## Balanced Accuracy 0.9075 0.8964 0.9227
# Delete X from the names
colnames(mat_2_final) <- paste0("X", colnames(mat_2_final))
# Predict
predictions_new_rf_common <- predict(rf_model, newdata = mat_2_final)
# Align predictions with sample identifiers, assuming row names of mat_2 represent them
predictions_new_df_rf_common <- data.frame(prediction = as.character(predictions_new_rf_common), row.names = rownames(mat_2_final))
predictions_new_df_rf_common <- tibble::rownames_to_column(predictions_new_df_rf_common, var = "SAMPLE_ID")
predictions_with_class_rf <- predictions_new_df_rf_common %>%
left_join(df_meta %>% dplyr::select(SAMPLE_ID,RNASEQ_CLUSTER_CONSENHIER, OS_MONTHS, DECEASED), by = "SAMPLE_ID")
matching_counts_rf <- table(predictions_with_class_rf$RNASEQ_CLUSTER_CONSENHIER == predictions_with_class_rf$prediction)
matching_counts_rf
##
## FALSE TRUE
## 12 315
# Prepare the data frame for analysis
df <- predictions_with_class %>%
distinct(PATIENT_ID, .keep_all = T) %>%
filter(!is.na(DECEASED)) %>%
mutate(CLUSTER = prediction)
surv_obj <- Surv(time = df$OS_MONTHS/12, event = df$DECEASED)
# Fit Kaplan-Meier survival curve
km_fit <- survfit(surv_obj ~ df$CLUSTER)
ggsurvplot(km_fit,
conf.int = F, # Show confidence interval
risk.table = F, # Add a table of the number of subjects at risk at each time point
xlab = "Time (years)", # Customize X-axis label
ylab = "Cumulative Survival Rate", # Customize Y-axis label
ggtheme = theme_minimal(),
data = df) # Use a minimal theme for the plot
samples <- read.delim("~/TCGA_SKCM/skcm_tcga/data_clinical_sample.txt", skip = 4)
df_s <- unique(samples)
df_s <- df_s %>%
distinct(PATIENT_ID, .keep_all = TRUE) %>%
dplyr::select(PATIENT_ID, SAMPLE_ID, DAYS_TO_COLLECTION, SAMPLE_TYPE, TMB_NONSYNONYMOUS) %>%
mutate(METASTASIS = case_when(
SAMPLE_TYPE == "Primary" ~ 0,
SAMPLE_TYPE == "Metastasis" ~ 1
)) %>%
mutate(log10TMB = log10(TMB_NONSYNONYMOUS)) %>%
dplyr::select(-SAMPLE_TYPE, -TMB_NONSYNONYMOUS) %>%
mutate_all(~gsub("\\[Not Available\\]", NA, .)) %>%
mutate(DAYS_TO_COLLECTION = as.numeric(DAYS_TO_COLLECTION),
METASTASIS = as.numeric(METASTASIS),
log10TMB = as.numeric(log10TMB)
)
head(df_s)
## PATIENT_ID SAMPLE_ID DAYS_TO_COLLECTION METASTASIS log10TMB
## 1 TCGA-BF-A1PU TCGA-BF-A1PU-01 338 0 0.3862016
## 2 TCGA-BF-A1PV TCGA-BF-A1PV-01 106 0 0.9311187
## 3 TCGA-BF-A1PX TCGA-BF-A1PX-01 122 0 1.0114295
## 4 TCGA-BF-A1PZ TCGA-BF-A1PZ-01 97 0 0.8711836
## 5 TCGA-BF-A1Q0 TCGA-BF-A1Q0-01 75 0 1.2855573
## 6 TCGA-D3-A1Q1 TCGA-D3-A1Q1-06 2305 1 0.2304489
rm(samples)
patients <- read.delim("~/TCGA_SKCM/skcm_tcga/data_clinical_patient.txt", skip=4)
df_p <- unique(patients)
df_p <- df_p %>%
distinct(PATIENT_ID, .keep_all = TRUE) %>%
dplyr::select(PATIENT_ID, SEX, HEIGHT, WEIGHT, RACE, TUMOR_STATUS, CLARK_LEVEL_AT_DIAGNOSIS, AJCC_PATHOLOGIC_TUMOR_STAGE, OS_STATUS, OS_MONTHS, DFS_STATUS, DFS_MONTHS, AGE) %>%
mutate(MALE = case_when(
SEX == "Male" ~ 1,
SEX == "Female" ~ 0
)) %>%
mutate(CLARK_LEVEL = case_when(
CLARK_LEVEL_AT_DIAGNOSIS == "I" ~ 1,
CLARK_LEVEL_AT_DIAGNOSIS == "II" ~ 2,
CLARK_LEVEL_AT_DIAGNOSIS == "III" ~ 3,
CLARK_LEVEL_AT_DIAGNOSIS == "IV" ~ 4,
)) %>%
mutate(DECEASED = case_when(
OS_STATUS == "1:DECEASED" ~ 1,
OS_STATUS == "0:LIVING" ~ 0
)) %>%
mutate(RECURRED_PROGRESSED = case_when(
DFS_STATUS == "1:Recurred/Progressed" ~ 1,
DFS_STATUS == "0:DiseaseFree" ~ 0
))%>%
mutate(TUMOR_FREE = case_when(
TUMOR_STATUS == "TUMOR FREE" ~ 1,
TUMOR_STATUS == "WITH TUMOR" ~ 0
))%>%
dplyr::select(-SEX, -CLARK_LEVEL_AT_DIAGNOSIS,-OS_STATUS, -DFS_STATUS, -TUMOR_STATUS) %>%
mutate_all(~gsub("\\[Not Available\\]", NA, .)) %>%
mutate(HEIGHT = as.numeric(HEIGHT),
WEIGHT = as.numeric(WEIGHT),
OS_MONTHS = as.numeric(OS_MONTHS),
AGE = as.numeric(AGE),
DFS_MONTHS = as.numeric(DFS_MONTHS),
MALE = as.numeric(MALE),
CLARK_LEVEL = as.numeric(CLARK_LEVEL),
DECEASED = as.numeric(DECEASED),
RECURRED_PROGRESSED = as.numeric(RECURRED_PROGRESSED),
TUMOR_FREE = as.numeric(TUMOR_FREE)
)
head(df_p)
## PATIENT_ID HEIGHT WEIGHT RACE AJCC_PATHOLOGIC_TUMOR_STAGE OS_MONTHS
## 1 TCGA-3N-A9WB 175 78 WHITE Stage IA 17.02
## 2 TCGA-3N-A9WC 183 68 WHITE Stage IIA 66.43
## 3 TCGA-3N-A9WD 183 116 WHITE Stage IIIA 12.98
## 4 TCGA-BF-A1PU 160 58 WHITE Stage IIC 12.71
## 5 TCGA-BF-A1PV 160 70 WHITE Stage IIC 0.46
## 6 TCGA-BF-A1PX 175 78 WHITE Stage IIIB 9.26
## DFS_MONTHS AGE MALE CLARK_LEVEL DECEASED RECURRED_PROGRESSED TUMOR_FREE
## 1 16.00 71 1 3 1 1 0
## 2 56.01 82 1 4 0 1 0
## 3 10.05 82 1 3 1 1 0
## 4 15.90 46 0 3 0 1 1
## 5 0.46 74 0 4 0 0 1
## 6 NA 56 1 3 1 NA 1
rm(patients)
df_meta <-
left_join(df_s, df_p, by = "PATIENT_ID") %>%
filter(SAMPLE_ID != "TCGA-GN-A261-06") %>%
filter(SAMPLE_ID != "TCGA-GN-A269-01") %>%
filter(AJCC_PATHOLOGIC_TUMOR_STAGE %in% c("Stage IV"))
df_meta_copy <- df_meta
rm(df_s, df_p)
dim(df_meta)
## [1] 23 17
# Read and preprocess the data
df_t <-
read_delim("~/TCGA_SKCM/skcm_tcga/data_mrna_seq_v2_rsem.txt") %>%
dplyr::select(-Hugo_Symbol) %>%
dplyr::select(Entrez_Gene_Id, df_meta$SAMPLE_ID) %>%
distinct(Entrez_Gene_Id, .keep_all = TRUE)
# Transpose and set row and column names
mat <- t(df_t[-1])
colnames(mat) <- df_t$Entrez_Gene_Id
rownames(mat) <- colnames(df_t)[-1]
# Log Transformation
mat <- log2(mat + 1)
# Feature selection based on variability
features <-
apply(mat, 2, mad) %>%
sort(decreasing = T) %>%
.[1:3000] %>%
names()
# Normalize the data
mat <- apply(mat, 2, function(x) (x - mean(x)))
# Reduce the matrix to selected features
mat <- mat[, features]
smpl.dend <- t(mat) %>%
cor(method = "pearson") %>%
{as.dist(1 - .)} %>%
{hclust(., method = "average")} %>%
as.dendrogram()
plot(smpl.dend)
gene.dend <- mat %>%
cor(method = "pearson") %>%
{as.dist(1 - .)} %>%
{hclust(., method = "ward.D2")} %>%
as.dendrogram()
k.smpl <- 4
k.gene <- 4
color <- c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00")
smpl.dend <- color_branches(smpl.dend, k = k.smpl, col = color[1:k.smpl])
gene.dend <- color_branches(gene.dend, k = k.gene, col = color[1:k.gene])
gene.grp <- cutree(gene.dend, k = k.gene)
gene.grp <- case_when(gene.grp == 1 ~ "G1",
gene.grp == 2 ~ "G2",
gene.grp == 3 ~ "G3",
gene.grp == 4 ~ "G4")
ck.bp <- compareCluster(geneCluster = list("G1" = colnames(mat)[gene.grp == "G1"],
"G2" = colnames(mat)[gene.grp == "G2"],
"G3" = colnames(mat)[gene.grp == "G3"],
"G4" = colnames(mat)[gene.grp == "G4"]),
fun = "enrichGO", OrgDb = "org.Hs.eg.db", ont = "BP",
universe = colnames(mat),
pvalueCutoff = 0.01, qvalueCutoff = 1)
GO <- ck.bp@compareClusterResult %>%
as_tibble() %>%
group_by(Cluster) %>%
arrange(p.adjust) %>%
slice_head(n = 5)
text_list = list(
text1 = filter(GO, Cluster == "G1")$Description,
text2 = filter(GO, Cluster == "G2")$Description,
text3 = filter(GO, Cluster == "G3")$Description,
text4 = filter(GO, Cluster == "G4")$Description
)
ht_opt$message = FALSE
my_palette <- brewer.pal(n = 5, name = "Set1")
gene.ha <- rowAnnotation(foo = anno_empty(border = FALSE, width = max_text_width(unlist(text_list)) + unit(4, "mm")))
p <- Heatmap(t(mat),
show_row_names = F, show_column_names = F,
name = "Gene expression",
right_annotation = gene.ha,
row_title = 'G%s', column_title = 'S%s',
row_dend_width = unit(10, "mm"),
column_dend_height = unit(10, "mm"),
show_heatmap_legend = FALSE,
cluster_rows = gene.dend,
cluster_columns = smpl.dend,
row_split = k.gene,
column_split = k.smpl)
png("heatmap_BG_3.png", width = 9, height = 6, units = "in", res = 300)
draw(p)
for(i in 1:k.gene) {
decorate_annotation("foo", slice = i, {
grid.rect(x = 0, width = unit(2, "mm"),
gp = gpar(fill = color[i], col = NA, fontsize = 10), just = "left")
grid.text(paste(text_list[[i]], collapse = "\n"),
x = unit(4, "mm"), just = "left")
})
}
dev.off()
## quartz_off_screen
## 2
smpl.grp <- cutree(smpl.dend, k = k.smpl)
fviz_cluster(list(data = mat, cluster = smpl.grp),
palette = "Set1",
geom = "point",
ellipse.type = "convex",
show.clust.cent = FALSE,
main = "Sample clusters",
ggtheme = theme_minimal())
gene.grp <- cutree(gene.dend, k = k.gene)
fviz_cluster(list(data = t(mat), cluster = gene.grp),
palette = "Set1",
geom = "point",
ellipse.type = "convex",
show.clust.cent = FALSE,
main = "Gene clusters",
ggtheme = theme_minimal())
mat_df <- as.data.frame(mat)
mat_df$labels <- as.factor(smpl.grp)
# Ensure that labels are factors with appropriate levels
mat_df$labels <- factor(mat_df$labels, levels = c("1", "2", "3", "4"))
# Select genes
genes <- colnames(mat_df)
# Perform Kruskal-Wallis test for each gene
res <- sapply(genes, function(x) {
model <- kruskal.test(mat_df[[x]] ~ mat_df$labels)
p <- model$p.value
return(p)
})
# Set names for the result vector
names(res) <- genes
# Adjust p-values for multiple testing using Benjamini-Hochberg procedure
adjusted_p <- p.adjust(res, method = "BH")
# Define a significance threshold (e.g., 0.05)
threshold <- 0.1
# Identify significant genes for each label
significant_genes <- list()
for (label in levels(mat_df$labels)) {
sig_genes <- names(adjusted_p)[adjusted_p < threshold & mat_df$labels == label]
significant_genes[[paste("Label", label, sep = "_")]] <- sig_genes
}
symbols_1 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[1]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query <- symbols_1$ENTREZID
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
genes_info_1 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query,
mart = ensembl)
head(genes_info_1)
## gene_biotype entrezgene_id external_gene_name
## 1 lncRNA 100128385 FAM225B
## 2 protein_coding 1004 CDH6
## 3 protein_coding 10382 TUBB4A
## 4 protein_coding 10406 WFDC2
## 5 protein_coding 10630 PDPN
## 6 protein_coding 1089 CEACAM4
## description
## 1 family with sequence similarity 225 member B [Source:HGNC Symbol;Acc:HGNC:21865]
## 2 cadherin 6 [Source:HGNC Symbol;Acc:HGNC:1765]
## 3 tubulin beta 4A class IVa [Source:HGNC Symbol;Acc:HGNC:20774]
## 4 WAP four-disulfide core domain 2 [Source:HGNC Symbol;Acc:HGNC:15939]
## 5 podoplanin [Source:HGNC Symbol;Acc:HGNC:29602]
## 6 CEA cell adhesion molecule 4 [Source:HGNC Symbol;Acc:HGNC:1816]
ego1 <- enrichGO(gene = symbols_1$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.1)
png("~/TCGA_SKCM/dot_plot_7.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego1, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
symbols_2 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[2]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query_2 <- symbols_2$ENTREZID
genes_info_2 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query_2,
mart = ensembl)
head(genes_info_2)
## gene_biotype entrezgene_id external_gene_name
## 1 protein_coding 10004 NAALADL1
## 2 protein_coding 100049587 SIGLEC14
## 3 protein_coding 10103 TSPAN1
## 4 protein_coding 10202 DHRS2
## 5 protein_coding 1036 CDO1
## 6 protein_coding 10411 RAPGEF3
## description
## 1 N-acetylated alpha-linked acidic dipeptidase like 1 [Source:HGNC Symbol;Acc:HGNC:23536]
## 2 sialic acid binding Ig like lectin 14 [Source:HGNC Symbol;Acc:HGNC:32926]
## 3 tetraspanin 1 [Source:HGNC Symbol;Acc:HGNC:20657]
## 4 dehydrogenase/reductase 2 [Source:HGNC Symbol;Acc:HGNC:18349]
## 5 cysteine dioxygenase type 1 [Source:HGNC Symbol;Acc:HGNC:1795]
## 6 Rap guanine nucleotide exchange factor 3 [Source:HGNC Symbol;Acc:HGNC:16629]
ego2 <- enrichGO(gene = symbols_2$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.1)
png("~/TCGA_SKCM/dot_plot_8.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego2, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
symbols_3 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[3]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query_3 <- symbols_3$ENTREZID
genes_info_3 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query_3,
mart = ensembl)
head(genes_info_3)
## gene_biotype entrezgene_id external_gene_name
## 1 unprocessed_pseudogene 100132417 FCGR1CP
## 2 protein_coding 10234 LRRC17
## 3 protein_coding 10278 EFS
## 4 protein_coding 10351 ABCA8
## 5 protein_coding 10389 SCML2
## 6 protein_coding 10398 MYL9
## description
## 1 Fc gamma receptor Ic, pseudogene [Source:HGNC Symbol;Acc:HGNC:3615]
## 2 leucine rich repeat containing 17 [Source:HGNC Symbol;Acc:HGNC:16895]
## 3 embryonal Fyn-associated substrate [Source:HGNC Symbol;Acc:HGNC:16898]
## 4 ATP binding cassette subfamily A member 8 [Source:HGNC Symbol;Acc:HGNC:38]
## 5 Scm polycomb group protein like 2 [Source:HGNC Symbol;Acc:HGNC:10581]
## 6 myosin light chain 9 [Source:HGNC Symbol;Acc:HGNC:15754]
ego3 <- enrichGO(gene = symbols_3$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05)
png("~/TCGA_SKCM/dot_plot_9.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego3, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
symbols_4 <- AnnotationDbi::select(org.Hs.eg.db,
keys = as.character(significant_genes[[4]]),
columns = c("SYMBOL"),
keytype = "ENTREZID") %>% na.omit()
values_to_query_4 <- symbols_4$ENTREZID
# Expand attributes to include functional information
genes_info_4 <- getBM(attributes = c('gene_biotype', 'entrezgene_id', 'external_gene_name',
'description'),
filters = 'entrezgene_id',
values = values_to_query_4,
mart = ensembl)
# Check the first few rows to confirm it worked as expected
head(genes_info_4)
## gene_biotype entrezgene_id external_gene_name
## 1 lncRNA 100129396 FAM106C
## 2 lncRNA 100233209 PCED1B-AS1
## 3 protein_coding 10085 EDIL3
## 4 protein_coding 10235 RASGRP2
## 5 protein_coding 1043 CD52
## 6 protein_coding 10578 GNLY
## description
## 1 family with sequence similarity 106 member C [Source:HGNC Symbol;Acc:HGNC:38396]
## 2 PCED1B antisense RNA 1 [Source:HGNC Symbol;Acc:HGNC:44166]
## 3 EGF like repeats and discoidin domains 3 [Source:HGNC Symbol;Acc:HGNC:3173]
## 4 RAS guanyl releasing protein 2 [Source:HGNC Symbol;Acc:HGNC:9879]
## 5 CD52 molecule [Source:HGNC Symbol;Acc:HGNC:1804]
## 6 granulysin [Source:HGNC Symbol;Acc:HGNC:4414]
ego4 <- enrichGO(gene = symbols_4$SYMBOL,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05)
png("~/TCGA_SKCM/dot_plot_10.png", width = 1200, height = 800)
clusterProfiler::dotplot(ego4, showCategory = 20)
dev.off()
## quartz_off_screen
## 2
# Read the data
correlation <- read_excel("~/TCGA_SKCM/correlation.xlsx")
# Data type conversion and column selection
correlation <- correlation %>%
mutate(
`Normal-like centroid` = as.numeric(`Normal-like centroid`),
`Proliferative centroid` = as.numeric(`Proliferative centroid`),
`Pigmentation centroid` = as.numeric(`Pigmentation centroid`),
`High-immune centroid` = as.numeric(`High-immune centroid`)
) %>%
dplyr::select(-Probe, -ProbeID)
head(correlation)
## # A tibble: 6 × 5
## SYMBOL `Normal-like centroid` Proliferative centroi…¹ Pigmentation centroi…²
## <chr> <dbl> <dbl> <dbl>
## 1 FCHSD2 -0.179 -0.233 -0.349
## 2 TBC1D7 -0.471 -0.661 1.07
## 3 MBNL1 0.329 -0.582 -0.188
## 4 RASGRP2 -0.608 -0.480 -0.565
## 5 MARVELD2 2.56 -0.741 -0.437
## 6 CDO1 -0.444 -0.626 -0.664
## # ℹ abbreviated names: ¹`Proliferative centroid`, ²`Pigmentation centroid`
## # ℹ 1 more variable: `High-immune centroid` <dbl>
# Select numeric correlation data
numeric_correlation <- correlation %>%
dplyr::select(-SYMBOL)
# Classify rows based on maximum value
group <- apply(numeric_correlation, 1, which.max)
# Assign descriptive labels
group_labels <- c("normal_like", "proliferative", "pigmentation", "high_immune")
named_closest_group <- group_labels[group]
# Merge
merge_groups <- cbind(correlation, named_closest_group)
df_t_2 <-
read_tsv("~/TCGA_SKCM/skcm_tcga/data_mrna_seq_v2_rsem.txt") %>%
dplyr::select(-Entrez_Gene_Id) %>%
dplyr::select(Hugo_Symbol, df_meta$SAMPLE_ID) %>%
distinct(Hugo_Symbol, .keep_all = TRUE) %>% na.omit()
mat_2 <- t(df_t_2[-1])
colnames(mat_2) <- df_t_2$Hugo_Symbol
rownames(mat_2) <- colnames(df_t_2)[-1]
mat <- log2(mat + 1)
features <-
apply(mat_2, 2, mad) %>%
sort(decreasing = T) %>%
.[1:3000] %>%
names()
mat <- apply(mat, 2, function(x) (x - mean(x))/mad(x))
mat_2 <- mat_2[, features]
# Split by names of groups
list_of_merge_groups <- split(merge_groups, merge_groups$named_closest_group)
# Get Genes symbols
mat_2 <- data.frame(mat_2)
symbols <- colnames(mat_2)
high_immune <- intersect(list_of_merge_groups[[1]][[1]], symbols)
high_immune
## [1] "MBNL1" "DHRS3" "STAB1" "VWF" "CXCL12" "TNFRSF1B"
## [7] "TGFBR2" "TAGLN" "EPAS1" "GBP4" "RFTN1" "ADD3"
## [13] "SHANK3" "C3" "PLVAP"
normal_like <- intersect(list_of_merge_groups[[2]][[1]], symbols)
normal_like
## [1] "TAPBP" "PTGDS" "DST" "JUP" "PERP" "DEGS1"
pigmentation <- intersect(list_of_merge_groups[[3]][[1]], symbols)
pigmentation
## [1] "TBC1D7" "PLXNC1" "SORT1" "MREG" "SDCBP"
## [6] "LYST" "CDK2" "IRF4" "TSPAN10" "EDNRB"
## [11] "MYO5A" "SLC3A2" "TIMP2" "TTYH2" "SLC24A5"
## [16] "RGS20" "ST6GALNAC2" "STXBP1" "GPR143" "TYR"
## [21] "SLC45A2" "MRPL44" "DCT" "SYNGR1" "SOX13"
## [26] "GPNMB" "NINJ1" "PAX3" "MLANA" "STX7"
## [31] "GMPR" "SLC7A5" "PCDH7" "WDFY1" "ANKRD28"
## [36] "ANXA5" "CABLES1" "TRIM63" "AP1S2" "H2AFZ"
## [41] "PIGY" "MBNL2" "OSBPL9" "S100A13" "SGCD"
proliferation <- intersect(list_of_merge_groups[[4]][[1]], symbols)
proliferation
## [1] "SPRY2" "PRMT1" "PPFIBP1" "CDK6" "GPSM1"
smpl.grp <- cutree(smpl.dend, k = k.smpl)
smpl.grp <- case_when(smpl.grp == 1 ~ "normal_like",
smpl.grp == 2 ~ "pigmintation",
smpl.grp == 3 ~ "prolifirative",
smpl.grp == 4 ~ "high_immune")
df <- df_meta %>%
mutate(CLUSTER = as.factor(smpl.grp)) %>%
distinct(PATIENT_ID, .keep_all = T) %>%
filter(!is.na(DECEASED))
# Convert OS_MONTHS from months to weeks
weeks = df$OS_MONTHS * 4.345
# Cap the survival time at 100 weeks
weeks_capped = pmin(weeks, 100)
# Adjust the event indicator
# Event is censored (0) if original time is greater than 100 weeks, regardless of original event status
event_capped = ifelse(weeks > 100, 0, df$DECEASED)
# Create the survival object with capped time and adjusted event
surv_obj <- Surv(time = weeks_capped, event = event_capped)
# Fit Kaplan-Meier survival curve
km_fit <- survfit(surv_obj ~ df$CLUSTER)
# Updated ggsurvplot call to display y-axis labels as percentages
ggsurvplot(km_fit,
conf.int = FALSE, # Do not show confidence interval
risk.table = FALSE, # Do not add a table of the number of subjects at risk at each time point
xlab = "Time (weeks)", # Customize X-axis label to indicate the time unit
ylab = "Survival Fraction", # Customize Y-axis label
ggtheme = theme_minimal(), # Use a minimal theme for the plot
data = df)